PAC - Global YouTube Statistics 2023


GRUPO 3:

- Elvis Dany Valentín Victorino

- Jonathan Jerry Castillo Rodríguez

- Juan Carlos Guerra Sandoval

- Raúl Raico Gallardo


Carga del Dataset

# Load the raw dataset from the comma-separated file.
df <- read.csv(
  file = "data/Global YouTube Statistics.csv",
  sep = ",",
  encoding = "UTF-8"
)

Exploración del DataFrame

Variables

# List the column names of the loaded data frame.
names(df)
##  [1] "rank"                                   
##  [2] "Youtuber"                               
##  [3] "subscribers"                            
##  [4] "video.views"                            
##  [5] "category"                               
##  [6] "Title"                                  
##  [7] "uploads"                                
##  [8] "Country"                                
##  [9] "Abbreviation"                           
## [10] "channel_type"                           
## [11] "video_views_rank"                       
## [12] "country_rank"                           
## [13] "channel_type_rank"                      
## [14] "video_views_for_the_last_30_days"       
## [15] "lowest_monthly_earnings"                
## [16] "highest_monthly_earnings"               
## [17] "lowest_yearly_earnings"                 
## [18] "highest_yearly_earnings"                
## [19] "subscribers_for_last_30_days"           
## [20] "created_year"                           
## [21] "created_month"                          
## [22] "created_date"                           
## [23] "Gross.tertiary.education.enrollment...."
## [24] "Population"                             
## [25] "Unemployment.rate"                      
## [26] "Urban_population"                       
## [27] "Latitude"                               
## [28] "Longitude"

Contenido superior del DataFrame

# Show the first rows of the data frame (head() defaults to six rows).
head(df)
##   rank                   Youtuber subscribers  video.views         category
## 1    1                   T-Series   245000000 228000000000            Music
## 2    2             YouTube Movies   170000000            0 Film & Animation
## 3    3                    MrBeast   166000000  28368841870    Entertainment
## 4    4 Cocomelon - Nursery Rhymes   162000000 164000000000        Education
## 5    5                  SET India   159000000 148000000000            Shows
## 6    6                      Music   119000000            0              nan
##                        Title uploads       Country Abbreviation  channel_type
## 1                   T-Series   20082         India           IN         Music
## 2              youtubemovies       1 United States           US         Games
## 3                    MrBeast     741 United States           US Entertainment
## 4 Cocomelon - Nursery Rhymes     966 United States           US     Education
## 5                  SET India  116536         India           IN Entertainment
## 6                      Music       0           nan          nan         Music
##   video_views_rank country_rank channel_type_rank
## 1                1            1                 1
## 2          4055159         7670              7423
## 3               48            1                 1
## 4                2            2                 1
## 5                3            2                 2
## 6          4057944          NaN               NaN
##   video_views_for_the_last_30_days lowest_monthly_earnings
## 1                        2.258e+09                  564600
## 2                        1.200e+01                       0
## 3                        1.348e+09                  337000
## 4                        1.975e+09                  493800
## 5                        1.824e+09                  455900
## 6                              NaN                       0
##   highest_monthly_earnings lowest_yearly_earnings highest_yearly_earnings
## 1                  9.0e+06                6.8e+06               1.084e+08
## 2                  5.0e-02                4.0e-02               5.800e-01
## 3                  5.4e+06                4.0e+06               6.470e+07
## 4                  7.9e+06                5.9e+06               9.480e+07
## 5                  7.3e+06                5.5e+06               8.750e+07
## 6                  0.0e+00                0.0e+00               0.000e+00
##   subscribers_for_last_30_days created_year created_month created_date
## 1                        2e+06         2006           Mar           13
## 2                          NaN         2006           Mar            5
## 3                        8e+06         2012           Feb           20
## 4                        1e+06         2006           Sep            1
## 5                        1e+06         2006           Sep           20
## 6                          NaN         2013           Sep           24
##   Gross.tertiary.education.enrollment.... Population Unemployment.rate
## 1                                    28.1 1366417754              5.36
## 2                                    88.2  328239523             14.70
## 3                                    88.2  328239523             14.70
## 4                                    88.2  328239523             14.70
## 5                                    28.1 1366417754              5.36
## 6                                     NaN        NaN               NaN
##   Urban_population Latitude Longitude
## 1        471031528 20.59368  78.96288
## 2        270663028 37.09024 -95.71289
## 3        270663028 37.09024 -95.71289
## 4        270663028 37.09024 -95.71289
## 5        471031528 20.59368  78.96288
## 6              NaN      NaN       NaN

Filas y columnas del DataFrame

# Number of rows and columns in the data frame.
dim(df)
## [1] 995  28

Resumen Estadístico

# Per-column summary: quartiles and NA counts for numeric columns,
# length/class/mode for character columns.
summary(df)
##       rank         Youtuber          subscribers         video.views       
##  Min.   :  1.0   Length:995         Min.   : 12300000   Min.   :0.000e+00  
##  1st Qu.:249.5   Class :character   1st Qu.: 14500000   1st Qu.:4.288e+09  
##  Median :498.0   Mode  :character   Median : 17700000   Median :7.761e+09  
##  Mean   :498.0                      Mean   : 22982412   Mean   :1.104e+10  
##  3rd Qu.:746.5                      3rd Qu.: 24600000   3rd Qu.:1.355e+10  
##  Max.   :995.0                      Max.   :245000000   Max.   :2.280e+11  
##                                                                            
##    category            Title              uploads           Country         
##  Length:995         Length:995         Min.   :     0.0   Length:995        
##  Class :character   Class :character   1st Qu.:   194.5   Class :character  
##  Mode  :character   Mode  :character   Median :   729.0   Mode  :character  
##                                        Mean   :  9187.1                     
##                                        3rd Qu.:  2667.5                     
##                                        Max.   :301308.0                     
##                                                                             
##  Abbreviation       channel_type       video_views_rank   country_rank   
##  Length:995         Length:995         Min.   :      1   Min.   :   1.0  
##  Class :character   Class :character   1st Qu.:    323   1st Qu.:  11.0  
##  Mode  :character   Mode  :character   Median :    916   Median :  51.0  
##                                        Mean   : 554249   Mean   : 386.1  
##                                        3rd Qu.:   3584   3rd Qu.: 123.0  
##                                        Max.   :4057944   Max.   :7741.0  
##                                        NA's   :1         NA's   :116     
##  channel_type_rank video_views_for_the_last_30_days lowest_monthly_earnings
##  Min.   :   1.0    Min.   :1.000e+00                Min.   :     0         
##  1st Qu.:  27.0    1st Qu.:2.014e+07                1st Qu.:  2700         
##  Median :  65.5    Median :6.408e+07                Median : 13300         
##  Mean   : 745.7    Mean   :1.756e+08                Mean   : 36886         
##  3rd Qu.: 139.8    3rd Qu.:1.688e+08                3rd Qu.: 37900         
##  Max.   :7741.0    Max.   :6.589e+09                Max.   :850900         
##  NA's   :33        NA's   :56                                              
##  highest_monthly_earnings lowest_yearly_earnings highest_yearly_earnings
##  Min.   :       0         Min.   :       0       Min.   :        0      
##  1st Qu.:   43500         1st Qu.:   32650       1st Qu.:   521750      
##  Median :  212700         Median :  159500       Median :  2600000      
##  Mean   :  589808         Mean   :  442257       Mean   :  7081814      
##  3rd Qu.:  606800         3rd Qu.:  455100       3rd Qu.:  7300000      
##  Max.   :13600000         Max.   :10200000       Max.   :163400000      
##                                                                         
##  subscribers_for_last_30_days  created_year  created_month       created_date  
##  Min.   :      1              Min.   :1970   Length:995         Min.   : 1.00  
##  1st Qu.: 100000              1st Qu.:2009   Class :character   1st Qu.: 8.00  
##  Median : 200000              Median :2013   Mode  :character   Median :16.00  
##  Mean   : 349079              Mean   :2013                      Mean   :15.75  
##  3rd Qu.: 400000              3rd Qu.:2016                      3rd Qu.:23.00  
##  Max.   :8000000              Max.   :2022                      Max.   :31.00  
##  NA's   :337                  NA's   :5                         NA's   :5      
##  Gross.tertiary.education.enrollment....   Population        Unemployment.rate
##  Min.   :  7.60                          Min.   :2.025e+05   Min.   : 0.750   
##  1st Qu.: 36.30                          1st Qu.:8.336e+07   1st Qu.: 5.270   
##  Median : 68.00                          Median :3.282e+08   Median : 9.365   
##  Mean   : 63.63                          Mean   :4.304e+08   Mean   : 9.279   
##  3rd Qu.: 88.20                          3rd Qu.:3.282e+08   3rd Qu.:14.700   
##  Max.   :113.10                          Max.   :1.398e+09   Max.   :14.720   
##  NA's   :123                             NA's   :123         NA's   :123      
##  Urban_population       Latitude        Longitude      
##  Min.   :    35588   Min.   :-38.42   Min.   :-172.10  
##  1st Qu.: 55908316   1st Qu.: 20.59   1st Qu.: -95.71  
##  Median :270663028   Median : 37.09   Median : -51.93  
##  Mean   :224214982   Mean   : 26.63   Mean   : -14.13  
##  3rd Qu.:270663028   3rd Qu.: 37.09   3rd Qu.:  78.96  
##  Max.   :842933962   Max.   : 61.92   Max.   : 138.25  
##  NA's   :123         NA's   :123      NA's   :123

Estructura del DataFrame

# Compact view of each column's type and first values.
str(df)
## 'data.frame':    995 obs. of  28 variables:
##  $ rank                                   : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ Youtuber                               : chr  "T-Series" "YouTube Movies" "MrBeast" "Cocomelon - Nursery Rhymes" ...
##  $ subscribers                            : int  245000000 170000000 166000000 162000000 159000000 119000000 112000000 111000000 106000000 98900000 ...
##  $ video.views                            : num  2.28e+11 0.00 2.84e+10 1.64e+11 1.48e+11 ...
##  $ category                               : chr  "Music" "Film & Animation" "Entertainment" "Education" ...
##  $ Title                                  : chr  "T-Series" "youtubemovies" "MrBeast" "Cocomelon - Nursery Rhymes" ...
##  $ uploads                                : int  20082 1 741 966 116536 0 1111 4716 493 574 ...
##  $ Country                                : chr  "India" "United States" "United States" "United States" ...
##  $ Abbreviation                           : chr  "IN" "US" "US" "US" ...
##  $ channel_type                           : chr  "Music" "Games" "Entertainment" "Education" ...
##  $ video_views_rank                       : int  1 4055159 48 2 3 4057944 5 44 630 8 ...
##  $ country_rank                           : num  1 7670 1 2 2 NaN 3 1 5 5 ...
##  $ channel_type_rank                      : num  1 7423 1 1 2 ...
##  $ video_views_for_the_last_30_days       : num  2.26e+09 1.20e+01 1.35e+09 1.98e+09 1.82e+09 ...
##  $ lowest_monthly_earnings                : num  564600 0 337000 493800 455900 ...
##  $ highest_monthly_earnings               : num  9.0e+06 5.0e-02 5.4e+06 7.9e+06 7.3e+06 ...
##  $ lowest_yearly_earnings                 : num  6.8e+06 4.0e-02 4.0e+06 5.9e+06 5.5e+06 ...
##  $ highest_yearly_earnings                : num  1.08e+08 5.80e-01 6.47e+07 9.48e+07 8.75e+07 ...
##  $ subscribers_for_last_30_days           : num  2e+06 NaN 8e+06 1e+06 1e+06 NaN NaN NaN 1e+05 6e+05 ...
##  $ created_year                           : num  2006 2006 2012 2006 2006 ...
##  $ created_month                          : chr  "Mar" "Mar" "Feb" "Sep" ...
##  $ created_date                           : num  13 5 20 1 20 24 12 29 14 23 ...
##  $ Gross.tertiary.education.enrollment....: num  28.1 88.2 88.2 88.2 28.1 NaN 88.2 63.2 81.9 88.2 ...
##  $ Population                             : num  1.37e+09 3.28e+08 3.28e+08 3.28e+08 1.37e+09 ...
##  $ Unemployment.rate                      : num  5.36 14.7 14.7 14.7 5.36 NaN 14.7 2.29 4.59 14.7 ...
##  $ Urban_population                       : num  4.71e+08 2.71e+08 2.71e+08 2.71e+08 4.71e+08 ...
##  $ Latitude                               : num  20.6 37.1 37.1 37.1 20.6 ...
##  $ Longitude                              : num  79 -95.7 -95.7 -95.7 79 ...

Valores no nulos por columna

# Count the non-missing values in each column. vapply() pins the result to
# exactly one integer per column, so the return type cannot change silently
# the way sapply()'s can (for example on a data frame with zero columns).
non_null_count <- vapply(df, function(x) sum(!is.na(x)), integer(1))
non_null_count
##                                    rank                                Youtuber 
##                                     995                                     995 
##                             subscribers                             video.views 
##                                     995                                     995 
##                                category                                   Title 
##                                     995                                     995 
##                                 uploads                                 Country 
##                                     995                                     995 
##                            Abbreviation                            channel_type 
##                                     995                                     995 
##                        video_views_rank                            country_rank 
##                                     994                                     879 
##                       channel_type_rank        video_views_for_the_last_30_days 
##                                     962                                     939 
##                 lowest_monthly_earnings                highest_monthly_earnings 
##                                     995                                     995 
##                  lowest_yearly_earnings                 highest_yearly_earnings 
##                                     995                                     995 
##            subscribers_for_last_30_days                            created_year 
##                                     658                                     990 
##                           created_month                            created_date 
##                                     995                                     990 
## Gross.tertiary.education.enrollment....                              Population 
##                                     872                                     872 
##                       Unemployment.rate                        Urban_population 
##                                     872                                     872 
##                                Latitude                               Longitude 
##                                     872                                     872

Matriz de valores faltantes (nulos)

# Plot the missing-value matrix of df to see which columns contain NAs.
library(naniar)
naniar::vis_miss(df)

Nuevo DataFrame: df2 (sin variables con valores nulos o vacíos)

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Keep only the columns that contain no missing values and no empty strings.
# select(where(...)) replaces the superseded select_if(); && short-circuits,
# so the empty-string test only runs on columns already known to be NA-free.
# Note: the literal text "nan" is not treated as missing by this filter, so
# character columns holding "nan" placeholders are kept.
df2 <- df %>%
  select(where(~ !anyNA(.x) && all(.x != "")))

Matriz de valores faltantes de df2

# Plot the missing-value matrix of df2 to confirm no NAs remain.
naniar::vis_miss(df2)

# Inspect which columns survived the filter and their types.
str(df2)
## 'data.frame':    995 obs. of  15 variables:
##  $ rank                    : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ Youtuber                : chr  "T-Series" "YouTube Movies" "MrBeast" "Cocomelon - Nursery Rhymes" ...
##  $ subscribers             : int  245000000 170000000 166000000 162000000 159000000 119000000 112000000 111000000 106000000 98900000 ...
##  $ video.views             : num  2.28e+11 0.00 2.84e+10 1.64e+11 1.48e+11 ...
##  $ category                : chr  "Music" "Film & Animation" "Entertainment" "Education" ...
##  $ Title                   : chr  "T-Series" "youtubemovies" "MrBeast" "Cocomelon - Nursery Rhymes" ...
##  $ uploads                 : int  20082 1 741 966 116536 0 1111 4716 493 574 ...
##  $ Country                 : chr  "India" "United States" "United States" "United States" ...
##  $ Abbreviation            : chr  "IN" "US" "US" "US" ...
##  $ channel_type            : chr  "Music" "Games" "Entertainment" "Education" ...
##  $ lowest_monthly_earnings : num  564600 0 337000 493800 455900 ...
##  $ highest_monthly_earnings: num  9.0e+06 5.0e-02 5.4e+06 7.9e+06 7.3e+06 ...
##  $ lowest_yearly_earnings  : num  6.8e+06 4.0e-02 4.0e+06 5.9e+06 5.5e+06 ...
##  $ highest_yearly_earnings : num  1.08e+08 5.80e-01 6.47e+07 9.48e+07 8.75e+07 ...
##  $ created_month           : chr  "Mar" "Mar" "Feb" "Sep" ...

Nuevo DataFrame: df3 (con promedio de ganancias anuales)

# df3 starts as a copy of df2
df3 <- df2

# Append the row-wise mean of the highest and lowest yearly earnings
df3[["promedio_yearly_earnings"]] <- rowMeans(
  df3[c("highest_yearly_earnings", "lowest_yearly_earnings")],
  na.rm = TRUE
)

Estructura de df3

# Confirm the new promedio_yearly_earnings column was added to df3.
str(df3)
## 'data.frame':    995 obs. of  16 variables:
##  $ rank                    : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ Youtuber                : chr  "T-Series" "YouTube Movies" "MrBeast" "Cocomelon - Nursery Rhymes" ...
##  $ subscribers             : int  245000000 170000000 166000000 162000000 159000000 119000000 112000000 111000000 106000000 98900000 ...
##  $ video.views             : num  2.28e+11 0.00 2.84e+10 1.64e+11 1.48e+11 ...
##  $ category                : chr  "Music" "Film & Animation" "Entertainment" "Education" ...
##  $ Title                   : chr  "T-Series" "youtubemovies" "MrBeast" "Cocomelon - Nursery Rhymes" ...
##  $ uploads                 : int  20082 1 741 966 116536 0 1111 4716 493 574 ...
##  $ Country                 : chr  "India" "United States" "United States" "United States" ...
##  $ Abbreviation            : chr  "IN" "US" "US" "US" ...
##  $ channel_type            : chr  "Music" "Games" "Entertainment" "Education" ...
##  $ lowest_monthly_earnings : num  564600 0 337000 493800 455900 ...
##  $ highest_monthly_earnings: num  9.0e+06 5.0e-02 5.4e+06 7.9e+06 7.3e+06 ...
##  $ lowest_yearly_earnings  : num  6.8e+06 4.0e-02 4.0e+06 5.9e+06 5.5e+06 ...
##  $ highest_yearly_earnings : num  1.08e+08 5.80e-01 6.47e+07 9.48e+07 8.75e+07 ...
##  $ created_month           : chr  "Mar" "Mar" "Feb" "Sep" ...
##  $ promedio_yearly_earnings: num  5.76e+07 3.10e-01 3.44e+07 5.04e+07 4.65e+07 ...

Gráfico de Tendencias (Dispersión)

Gráfico de Dispersión de Promedio de ganancias anuales por número de vistas

# Load the plotting library
library(ggplot2)

# Build a scatter plot of average yearly earnings against one column of
# `data`, with a fitted linear-regression line drawn in red.
#
#   data:  data frame holding `x_var` and `promedio_yearly_earnings`
#   x_var: name (string) of the column to place on the x axis
#   title: plot title
#
# Returns the ggplot object.
plot_earnings_trend <- function(data, x_var, title) {
  ggplot(data, aes(x = .data[[x_var]], y = promedio_yearly_earnings)) +
    geom_point() +
    geom_smooth(method = "lm", col = "red") +
    labs(x = x_var) +  # keep the plain column name as the axis label
    ggtitle(title) +
    theme_minimal()
}

# Video views vs average yearly earnings
plot1 <- plot_earnings_trend(
  df3, "video.views",
  "Tendencia entre \n Vistas de Videos y \n Ganancias Anuales"
)

# Uploads vs average yearly earnings
plot2 <- plot_earnings_trend(
  df3, "uploads",
  "Tendencia entre \n Cargas y \n Ganancias Anuales"
)

# Subscribers vs average yearly earnings
plot3 <- plot_earnings_trend(
  df3, "subscribers",
  "Tendencia entre \n Suscriptores y \n Ganancias Anuales"
)

# Muestra los gráficos en una fila
library(gridExtra)
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
## 
##     combine
# nrow = 1 lays the three plots side by side in a single row; the previous
# ncol = 2 produced a 2x2 grid with one empty cell.
grid.arrange(plot1, plot2, plot3, nrow = 1)
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'

Mapa de Calor

# Carga las librerías
library(ggplot2)
library(corrplot)
## corrplot 0.92 loaded
# Supongamos que df3 es tu conjunto de datos
# Puedes ajustar el código según la estructura exacta de tu conjunto de datos

# Keep only the variables to be correlated
df3_corr <- df3[
  c("promedio_yearly_earnings", "video.views", "uploads", "subscribers")
]

# Pairwise correlation matrix (cor() defaults)
corr_matrix <- cor(df3_corr)

# Heat map of the upper triangle on a 50-step white-to-dark-red ramp, with
# the coefficients printed in white
corrplot(
  corr_matrix,
  method = "color",
  type = "upper",
  col = colorRampPalette(c("white", "darkred"))(50),
  addCoef.col = "white",
  tl.cex = 0.7
)

Promedio de ganancias anuales por Categorías

# install.packages("plotly")
# install.packages("dplyr")
library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
library(dplyr)

# Drop the rows whose category is missing or is the literal placeholder "nan"
df3_filtered <- df3 %>%
  filter(!is.na(category) & category != "nan")

# Mean of the average yearly earnings per category, highest first
categories <- df3_filtered %>%
  group_by(category) %>%
  summarise(promedio_yearly_earnings = mean(promedio_yearly_earnings)) %>%
  arrange(desc(promedio_yearly_earnings))

# Bar chart coloured on the "Reds" scale by the plotted value; the hover
# text shows the value in millions
fig <- plot_ly(
  data = categories,
  x = ~reorder(category, promedio_yearly_earnings),
  y = ~promedio_yearly_earnings,
  type = "bar",
  marker = list(color = ~promedio_yearly_earnings, colorscale = "Reds"),
  text = ~paste(round(promedio_yearly_earnings / 1000000, 2), "M"),
  hoverinfo = "text",
  height = 500
)

# Layout. The axis-title font size is given as a nested list: the previous
# `titlefont_size = 16` is Python-style "magic underscore" syntax, which the
# R API passes through as an unknown attribute, so it had no effect.
fig <- fig %>% layout(
  title = "Promedio de ganancia anual por Categoría",
  xaxis = list(
    title = list(text = "Categoría", font = list(size = 16)),
    categoryorder = "total descending"
  ),
  yaxis = list(
    title = list(text = "Promedio de ganancia anual", font = list(size = 16))
  ),
  showlegend = FALSE
)

# Display the chart
fig

Nube de Palabras de los Youtubers con más Suscriptores

# Carga las librerías
library(wordcloud2)
library(tm)
## Loading required package: NLP
## 
## Attaching package: 'NLP'
## The following object is masked from 'package:ggplot2':
## 
##     annotate
# Word cloud of channel names sized by subscriber count.
#
# Re-encode the names UTF-8 -> UTF-8; with sub = "" any byte iconv() cannot
# convert is replaced by the empty string, i.e. removed.
# Note: this overwrites df$Youtuber in place.
df$Youtuber <- iconv(df$Youtuber, from = "UTF-8", to = "UTF-8", sub = "")
# Build a tm corpus with one document per channel name
corpus <- Corpus(VectorSource(df$Youtuber))
# Clean the text step by step; each step reassigns `corpus`.
# NOTE(review): every tm_map() call below emitted a "transformation drops
# documents" warning (captured in the `##` lines) - confirm the corpus still
# has one entry per row of df before pairing it with df$subscribers.
# Lower-case everything
corpus <- tm_map(corpus, content_transformer(tolower))
## Warning in tm_map.SimpleCorpus(corpus, content_transformer(tolower)):
## transformation drops documents
## Warning in tm_map.SimpleCorpus(corpus, content_transformer(tolower)):
## transformation drops documents
# Strip punctuation
corpus <- tm_map(corpus, removePunctuation)
## Warning in tm_map.SimpleCorpus(corpus, removePunctuation): transformation drops
## documents
## Warning in tm_map.SimpleCorpus(corpus, removePunctuation): transformation drops
## documents
# Strip digits
corpus <- tm_map(corpus, removeNumbers)
## Warning in tm_map.SimpleCorpus(corpus, removeNumbers): transformation drops
## documents
## Warning in tm_map.SimpleCorpus(corpus, removeNumbers): transformation drops
## documents
# Remove English stop words
corpus <- tm_map(corpus, removeWords, stopwords("english"))
## Warning in tm_map.SimpleCorpus(corpus, removeWords, stopwords("english")):
## transformation drops documents
## Warning in tm_map.SimpleCorpus(corpus, removeWords, stopwords("english")):
## transformation drops documents
# Collapse runs of whitespace
corpus <- tm_map(corpus, stripWhitespace)
## Warning in tm_map.SimpleCorpus(corpus, stripWhitespace): transformation drops
## documents
## Warning in tm_map.SimpleCorpus(corpus, stripWhitespace): transformation drops
## documents
# Final re-encode to UTF-8, this time replacing non-convertible bytes with a
# single space
corpus <- tm_map(corpus, content_transformer(function(x) iconv(x, to = "UTF-8", sub =
" ")))
## Warning in tm_map.SimpleCorpus(corpus, content_transformer(function(x) iconv(x,
## : transformation drops documents
## Warning in tm_map.SimpleCorpus(corpus, content_transformer(function(x) iconv(x,
## : transformation drops documents
# Pair each cleaned channel name with its subscriber count
# (presumably wordcloud2 reads column 1 as the word and column 2 as its
# weight - verify against the wordcloud2 documentation)
youtubers_data <- data.frame(Youtuber = sapply(corpus, function(x) as.character(x)),
Subscribers = df$subscribers)
# NOTE(review): nothing visible here appears to use colorRamps;
# colorRampPalette() is provided by grDevices - confirm before removing.
library(colorRamps)
# Palette generator for a white-to-red gradient
red_palette <- colorRampPalette(c("White", "red"))
# Font family for the cloud (the string carries a trailing space)
font <- "Arial Narrow "
# Draw the word cloud with 100 gradient colours on a black background
wordcloud2(data = youtubers_data, color = red_palette(100), backgroundColor = "black",
fontFamily = font)

Los 10 Youtubers con más Suscriptores (Millones)

library(ggplot2)
# Take the ten channels with the most subscribers. Sorting explicitly means
# the result no longer depends on the CSV already being in rank order; ties
# keep their original relative order.
top10 <- head(df3[order(df3$subscribers, decreasing = TRUE), ], 10)

# Bar chart of subscriber counts, tallest first, each bar labelled in
# millions. geom_col() is the idiomatic form of geom_bar(stat = "identity").
ggplot(
  top10,
  aes(x = reorder(Youtuber, -subscribers), y = subscribers, fill = subscribers)
) +
  geom_col(width = 1.0) +
  geom_text(aes(label = subscribers / 1000000), vjust = -0.5, size = 2.4) +
  labs(
    title = "Los 10 Youtubers con más Suscriptores (Millones)",
    x = "Youtuber",
    y = "suscripciones"
  ) +
  theme(
    # Tilt the x-axis labels so long channel names do not overlap
    axis.text.x = element_text(angle = 45, hjust = 1),
    plot.title = element_text(size = 10),
    plot.margin = margin(l = 1, r = 1, unit = "pt")
  ) +
  # Colour gradient from pink (fewest) to dark red (most)
  scale_fill_gradient(low = "pink", high = "darkred")

Porcentaje de Canales por Categoría

# Isolate the category column as a one-column data frame
df3_category <- df3["category"]

# Drop the rows whose category is the literal string "nan"
df3_category_sin_nan <- df3_category[
  !(df3_category[["category"]] == "nan"), ,
  drop = FALSE
]

str(df3_category_sin_nan)
## 'data.frame':    949 obs. of  1 variable:
##  $ category: chr  "Music" "Film & Animation" "Entertainment" "Education" ...
# Frequency of every value found in 'category'
category_counts <- table(df3_category_sin_nan$category)

# The same frequencies expressed as percentages of the total
category_percentages <- 100 * prop.table(category_counts)

# Assemble the summary table one column at a time
category_summary <- data.frame(category = names(category_counts))
category_summary$count <- as.numeric(category_counts)
category_summary$percentage <- as.numeric(category_percentages)

# Print the result
category_summary
##                 category count percentage
## 1       Autos & Vehicles     2  0.2107482
## 2                 Comedy    69  7.2708114
## 3              Education    45  4.7418335
## 4          Entertainment   241 25.3951528
## 5       Film & Animation    46  4.8472076
## 6                 Gaming    94  9.9051633
## 7          Howto & Style    40  4.2149631
## 8                 Movies     2  0.2107482
## 9                  Music   202 21.2855638
## 10       News & Politics    26  2.7397260
## 11 Nonprofits & Activism     2  0.2107482
## 12        People & Blogs   132 13.9093783
## 13        Pets & Animals     4  0.4214963
## 14  Science & Technology    17  1.7913593
## 15                 Shows    13  1.3698630
## 16                Sports    11  1.1591149
## 17              Trailers     2  0.2107482
## 18       Travel & Events     1  0.1053741
library(plotly)
library(dplyr)


# Category frequencies, sorted from least to most frequent
category <- sort(table(df3_category_sin_nan$category))

# Pie chart with one green-to-red ramp colour per category, followed by the
# layout settings
pie_chart <- plot_ly(
  labels = names(category),
  values = category,
  type = "pie",
  marker = list(
    colors = colorRampPalette(c("green", "red"))(length(category))
  )
) %>%
  layout(
    title = "Canales por Categorías",
    uniformtext = list(minsize = 10, mode = "hide"),
    template = "plotly_white"
  )

# Display the chart
pie_chart

Los 10 Países con mayor cantidad de Youtubers

# Isolate the Country column as a one-column data frame
df3_Country <- df3["Country"]

# Drop the rows whose country is the literal string "nan"
df3_Country_sin_nan <- df3_Country[
  !(df3_Country[["Country"]] == "nan"), ,
  drop = FALSE
]

str(df3_Country_sin_nan)
## 'data.frame':    873 obs. of  1 variable:
##  $ Country: chr  "India" "United States" "United States" "United States" ...
library(dplyr)

# Count the channels per country.
country_counts <- table(df3_Country_sin_nan$Country)

# Print the full table of counts.
print(country_counts)
## 
##          Afghanistan              Andorra            Argentina 
##                    1                    1                   13 
##            Australia           Bangladesh             Barbados 
##                    9                    1                    1 
##               Brazil               Canada                Chile 
##                   62                   15                    3 
##                China             Colombia                 Cuba 
##                    1                   11                    1 
##              Ecuador                Egypt          El Salvador 
##                    2                    2                    1 
##              Finland               France              Germany 
##                    1                    5                    6 
##                India            Indonesia                 Iraq 
##                  168                   28                    2 
##                Italy                Japan               Jordan 
##                    2                    5                    3 
##               Kuwait               Latvia             Malaysia 
##                    1                    1                    1 
##               Mexico              Morocco          Netherlands 
##                   33                    1                    3 
##             Pakistan                 Peru          Philippines 
##                    6                    1                   12 
##               Russia                Samoa         Saudi Arabia 
##                   16                    1                    9 
##            Singapore          South Korea                Spain 
##                    3                   17                   22 
##               Sweden          Switzerland             Thailand 
##                    4                    1                   18 
##               Turkey              Ukraine United Arab Emirates 
##                    4                    8                    7 
##       United Kingdom        United States            Venezuela 
##                   43                  313                    1 
##              Vietnam 
##                    3
# Rank the countries by channel count and keep the ten largest
top_10_countries <- country_counts %>%
  sort(decreasing = TRUE) %>%
  head(10)

# Display the ten highest counts
print(top_10_countries)
## 
##  United States          India         Brazil United Kingdom         Mexico 
##            313            168             62             43             33 
##      Indonesia          Spain       Thailand    South Korea         Russia 
##             28             22             18             17             16
library(plotly)

# Build the funnel chart: one stage per country, showing the count and the
# country name. textfont is a trace attribute, so it is set here rather than
# in layout(), which rejected it with a warning.
fig <- plot_ly(
  type = "funnel",
  x = top_10_countries,
  textinfo = "value+text",
  text = names(top_10_countries),
  textfont = list(size = 12),
  marker = list(
    color = colorRampPalette(c("red", "pink"))(length(top_10_countries))
  )
)

# Layout settings.
# NOTE(review): layout() also warned about funnelmode and funnelgap. They are
# kept unchanged here because plotly.js documents both as layout attributes
# for funnel traces - confirm against the installed plotly version.
fig <- fig %>% layout(
  title = "Top 10 de Países con más de Youtubers",
  template = 'plotly_white',
  funnelmode = "stack",
  funnelgap = 0.2
)

# Display the chart
fig
## Warning: 'layout' objects don't have these attributes: 'funnelmode', 'funnelgap', 'textfont'
## Valid attributes include:
## '_deprecated', 'activeshape', 'annotations', 'autosize', 'autotypenumbers', 'calendar', 'clickmode', 'coloraxis', 'colorscale', 'colorway', 'computed', 'datarevision', 'dragmode', 'editrevision', 'editType', 'font', 'geo', 'grid', 'height', 'hidesources', 'hoverdistance', 'hoverlabel', 'hovermode', 'images', 'legend', 'mapbox', 'margin', 'meta', 'metasrc', 'modebar', 'newshape', 'paper_bgcolor', 'plot_bgcolor', 'polar', 'scene', 'selectdirection', 'selectionrevision', 'separators', 'shapes', 'showlegend', 'sliders', 'smith', 'spikedistance', 'template', 'ternary', 'title', 'transition', 'uirevision', 'uniformtext', 'updatemenus', 'width', 'xaxis', 'yaxis', 'barmode', 'bargap', 'mapType'

Youtubers en el Mundo

library(dplyr)
library(plotly)

# Number of channels per country.
conteos_por_pais <- df3_Country_sin_nan %>%
  count(Country, name = "Contar_por_pais")

# Attach the per-country count to every row, then keep a single row per
# country so each location is passed to the map only once.
df3_Country_sin_nan_2 <- df3_Country_sin_nan %>%
  left_join(conteos_por_pais, by = "Country") %>%
  distinct(Country, .keep_all = TRUE)

# Custom colour scale as a list of (position, colour) stops.
# Each stop must be its own list element: nesting c() calls would flatten
# everything into one character vector and lose the pair structure.
custom_colorscale <- list(
  list(0, "#115f9a"),
  list(0.05, "#1984c5"),
  list(0.1, "#22a7f0"),
  list(0.2, "#48b5c4"),
  list(0.4, "#76c68f"),
  list(0.6, "#a6d75b"),
  list(0.7, "#c9e52f"),
  list(0.8, "#d0ee11"),
  list(1, "#f4f100")
)

# Choropleth of channel counts by country name.
# The figure size is given here because setting width/height in layout() is
# deprecated; the title is set in layout() because `title` is not a
# choropleth trace attribute.
fig <- plot_ly(
  data = df3_Country_sin_nan_2,
  type = "choropleth",
  locations = ~Country,
  locationmode = "country names",
  z = ~Contar_por_pais,
  colorscale = custom_colorscale,
  hoverinfo = "text",
  text = ~paste("Nº Canales: ", Contar_por_pais),
  height = 630,
  width = 1100
)

# Figure-level settings: title (the font size is nested under title$font)
# and the map's coastline, land and ocean styling.
fig <- fig %>% layout(
  title = list(
    text = "Distribución de Canales en el Mundo",
    x = 0.3,
    font = list(size = 18)
  ),
  geo = list(
    showcoastlines = TRUE,
    coastlinecolor = "Black",
    showland = TRUE,
    landcolor = "LightGray",
    showocean = TRUE,
    oceancolor = "LightBlue"
  )
)
## Warning: Specifying width/height in layout() is now deprecated.
## Please specify in ggplotly() or plot_ly()
# Display the figure
fig
## Warning: 'layout' objects don't have these attributes: 'title_font'
## Valid attributes include:
## '_deprecated', 'activeshape', 'annotations', 'autosize', 'autotypenumbers', 'calendar', 'clickmode', 'coloraxis', 'colorscale', 'colorway', 'computed', 'datarevision', 'dragmode', 'editrevision', 'editType', 'font', 'geo', 'grid', 'height', 'hidesources', 'hoverdistance', 'hoverlabel', 'hovermode', 'images', 'legend', 'mapbox', 'margin', 'meta', 'metasrc', 'modebar', 'newshape', 'paper_bgcolor', 'plot_bgcolor', 'polar', 'scene', 'selectdirection', 'selectionrevision', 'separators', 'shapes', 'showlegend', 'sliders', 'smith', 'spikedistance', 'template', 'ternary', 'title', 'transition', 'uirevision', 'uniformtext', 'updatemenus', 'width', 'xaxis', 'yaxis', 'barmode', 'bargap', 'mapType'
## Warning: 'choropleth' objects don't have these attributes: 'title'
## Valid attributes include:
## 'autocolorscale', 'coloraxis', 'colorbar', 'colorscale', 'customdata', 'customdatasrc', 'featureidkey', 'geo', 'geojson', 'hoverinfo', 'hoverinfosrc', 'hoverlabel', 'hovertemplate', 'hovertemplatesrc', 'hovertext', 'hovertextsrc', 'ids', 'idssrc', 'legendgroup', 'legendgrouptitle', 'legendrank', 'locationmode', 'locations', 'locationssrc', 'marker', 'meta', 'metasrc', 'name', 'reversescale', 'selected', 'selectedpoints', 'showlegend', 'showscale', 'stream', 'text', 'textsrc', 'transforms', 'type', 'uid', 'uirevision', 'unselected', 'visible', 'z', 'zauto', 'zmax', 'zmid', 'zmin', 'zsrc', 'key', 'set', 'frame', 'transforms', '_isNestedKey', '_isSimpleKey', '_isGraticule', '_bbox'

Promedio de Ganancias anuales por País

library(plotly)
library(dplyr)

# Keep only the rows whose Country is neither NA nor the literal string "nan".
df3_filtered <- df3 %>%
  filter(!is.na(Country) & Country != "nan")

# Mean of promedio_yearly_earnings per country, largest first.
Country <- df3_filtered %>%
  group_by(Country) %>%
  summarise(promedio_yearly_earnings = mean(promedio_yearly_earnings)) %>%
  arrange(desc(promedio_yearly_earnings))

# Bar chart with one bar per country, coloured by the mean on the "Reds"
# scale. The hover text shows the country and the mean divided by one
# million, followed by "M".
fig <- plot_ly(
  data = Country,
  x = ~reorder(Country, promedio_yearly_earnings),
  y = ~promedio_yearly_earnings,
  type = "bar",
  marker = list(color = ~promedio_yearly_earnings, colorscale = "Reds"),
  text = ~paste(Country, round(promedio_yearly_earnings / 1000000, 2), "M"),
  hoverinfo = "text",
  height = 500
)

# Figure-level settings. The axis title size is nested under title$font;
# `titlefont_size` is not an axis attribute in R plotly, so it had no effect.
fig <- fig %>% layout(
  title = "Ganancia anual promedio por país",
  xaxis = list(
    title = list(text = "País", font = list(size = 16)),
    categoryorder = "total descending"
  ),
  yaxis = list(
    title = list(text = "Ganancia anual promedio", font = list(size = 16))
  ),
  showlegend = FALSE
)

# Display the figure
fig

Ganancias Anuales Promedio de Youtubers por Suscriptores

library(dplyr)

# Bind df3 to a section-specific name, then keep the rows holding the ten
# largest values of promedio_yearly_earnings.
df3_youtuber_ganancia <- df3
df3_top_10 <- top_n(
  df3_youtuber_ganancia,
  n = 10,
  wt = promedio_yearly_earnings
)

# Print the selected rows.
df3_top_10
##    rank                                    Youtuber subscribers  video.views
## 1     1                                    T-Series   245000000 228000000000
## 2     4                  Cocomelon - Nursery Rhymes   162000000 164000000000
## 3     5                                   SET India   159000000 148000000000
## 4    16                                    Sony SAB    83000000 101000000000
## 5    22                                      Zee TV    70500000  73139054467
## 6    84                                 GR6 EXPLODE    38900000  25154232306
## 7   140                                    StarPlus    32000000  26800674545
## 8   303                         KL BRO Biju Rithvik    22800000  17988347989
## 9   418                                DaFuq!?Boom!    19600000   7906181776
## 10  496 \xfd\xfd\xfd\xfd\xfd\xfd\xfd\xfd\xfd KIMPRO    17700000  19206701832
##            category                                       Title uploads
## 1             Music                                    T-Series   20082
## 2         Education                  Cocomelon - Nursery Rhymes     966
## 3             Shows                                   SET India  116536
## 4             Shows                                    Sony SAB   71270
## 5     Entertainment                                      Zee TV  129204
## 6             Music                                 GR6 EXPLODE    3043
## 7     Entertainment                                    StarPlus   44892
## 8     Entertainment                         KL BRO Biju Rithvik    1841
## 9  Film & Animation                                DaFuq!?Boom!     214
## 10              nan \xfd\xfd\xfd\xfd\xfd\xfd\xfd\xfd\xfd KIMPRO    1646
##          Country Abbreviation  channel_type lowest_monthly_earnings
## 1          India           IN         Music                  564600
## 2  United States           US     Education                  493800
## 3          India           IN Entertainment                  455900
## 4          India           IN Entertainment                  414300
## 5          India           IN Entertainment                  426800
## 6         Brazil           BR         Music                  408700
## 7          India           IN Entertainment                  416800
## 8            nan          nan Entertainment                  508100
## 9  United States           US Entertainment                  576000
## 10   South Korea           KR        People                  850900
##    highest_monthly_earnings lowest_yearly_earnings highest_yearly_earnings
## 1                   9000000                6800000               108400000
## 2                   7900000                5900000                94800000
## 3                   7300000                5500000                87500000
## 4                   6600000                5000000                79600000
## 5                   6800000                5100000                81900000
## 6                   6500000                4900000                78500000
## 7                   6700000                5000000                80000000
## 8                   8100000                6100000                97600000
## 9                   9200000                6900000               110600000
## 10                 13600000               10200000               163400000
##    created_month promedio_yearly_earnings
## 1            Mar                 57600000
## 2            Sep                 50350000
## 3            Sep                 46500000
## 4            Aug                 42300000
## 5            Dec                 43500000
## 6            Aug                 41700000
## 7            May                 42500000
## 8            Jul                 51850000
## 9            Jun                 58750000
## 10           Nov                 86800000
# Re-encode the channel names UTF-8 -> UTF-8, replacing any bytes that cannot
# be converted with the empty string (sub = "").
df3_top_10[["Youtuber"]] <- iconv(
  df3_top_10[["Youtuber"]],
  from = "UTF-8",
  to = "UTF-8",
  sub = ""
)

library(ggplot2)

# Bar chart: one bar per channel, height = subscribers, fill shaded by
# promedio_yearly_earnings. The label above each bar is that value divided by
# one million, written as "$<value>M".
ggplot(
  data = df3_top_10,
  mapping = aes(x = Youtuber, y = subscribers, fill = promedio_yearly_earnings)
) +
  geom_col() +
  geom_text(
    mapping = aes(label = paste0("$", promedio_yearly_earnings / 1e6, "M")),
    vjust = -0.5,
    size = 2.4  # small text so the labels fit above the bars
  ) +
  scale_fill_gradient(low = "lightcoral", high = "darkred") +
  labs(
    title = "Ganancias Anuales Promedio de Youtubers por Suscriptores",
    x = "Youtuber",
    y = "Subscriptores"
  ) +
  theme_minimal() +
  # Rotate the channel names so they do not overlap.
  theme(axis.text.x = element_text(angle = 45, hjust = 1))